Network Workshop

library(dplyr);library(statnet);library(igraph)
library(igraphdata);library(ergm);library(amen);library(reshape)
library(bayesplot); library(ggplot2);library(gridExtra)
library(mclust);library(repmis)

Intro: Karate Club

First, we can load the karate club data! This comes in as a ‘graph’ object in R

# Zachary's karate club network, shipped with igraphdata
data(karate)

# is it directed? nope
is_directed(karate)
## [1] FALSE

Sometimes (most times) we want the adjacency matrix for our observed graph

# dense 34 x 34 adjacency matrix (sparse = FALSE gives a base matrix)
karate_mat <- get.adjacency(karate,sparse = FALSE)

# visualizing our adj matrix
# columns reversed (34:1) so the heatmap's orientation matches the matrix
heatmap(karate_mat[,34:1],Rowv = NA, Colv = NA,scale="none")

For the karate club data, it comes with a vertex attribute, ‘Faction’. This tells us the ground truth communities for the nodes

# we can get the ground truth
# Faction is a vertex attribute: 1 = Mr Hi's side, 2 = John A's side
faction <- V(karate)$Faction 
truth   <- make_clusters(karate, V(karate)$Faction) #you can create a 'cluster' type object to be associated with the graph vertices

Calculating centrality scores

# Degree centrality.  The karate graph is undirected, so `mode` has no
# effect; use "all" to make that explicit (the original mode = "in" was
# misleading, though it produced identical results)
deg_centrality  <- centr_degree(karate, mode="all", normalized=TRUE)$res

clos_centrality <- centr_clo(karate, mode="all", normalized=TRUE)$res # mode does NOT matter for undirected graph

# eigenvector centrality: score proportional to the sum of neighbors' scores
eig_centrality  <- centr_eigen(karate, normalized=TRUE)$vector

# betweenness: number of shortest paths passing through each node
bet_centrality  <- centr_betw(karate, normalized=TRUE)$res

Plot our graph using centrality

# Set a layout (will discuss more later)
# Set a layout (will discuss more later)
# fixing the layout keeps nodes in the same positions across all four plots,
# so only the node sizes (the centrality scores) differ
lk <- layout_with_dh(karate)
plot(karate, vertex.size = deg_centrality*2, main = "Degree", layout = lk)

plot(karate, vertex.size = clos_centrality*20, main = "Closeness",
     layout = lk)

plot(karate, vertex.size = eig_centrality*20, main = "Eigen", 
     layout = lk)

# the multipliers (*2, *20, /10) just rescale each score to a visible size
plot(karate, vertex.size = bet_centrality/10, main = "Betweenness", 
     layout = lk)

Some Basics

Generating a network (ER)

First, generate a simple network from an ER model (we specify number of nodes and probability of edge)

set.seed(9)
# n: number of nodes, p: prob of edge
n <- 100
p <- .2

# Sample an Erdos-Renyi graph: each of the choose(n, 2) possible edges
# is included independently with probability p
ER_graph <- sample_gnp(n, p, directed = FALSE, loops = FALSE)

# igraph allows us to easily plot these graph objects
plot(ER_graph, vertex.size = 7) # lots of options here, for now we keep it simple

# create adj mat -- as_adjacency_matrix() is the current name for the
# deprecated get.adjacency(); returns a sparse matrix by default
ER_adj <- as_adjacency_matrix(ER_graph)

Making Graph Objects

There are lots of convenient functions to go from adj. matrices to graphs, edge lists to graphs, etc

# ER_adj came from an undirected graph (it is symmetric), so build an
# undirected graph back from it; the default mode = "directed" would
# create two arcs per original edge
graph_from_adj_mat <- graph_from_adjacency_matrix(ER_adj, mode = "undirected")
edges_from_graph   <- get.edgelist(ER_graph)
# likewise, graph_from_edgelist() defaults to directed = TRUE, which
# would misrepresent the undirected ER graph
graph_from_edges   <- graph_from_edgelist(edges_from_graph, directed = FALSE)

# Edgelist: one row per edge, columns are the two endpoints
head(edges_from_graph)
##      [,1] [,2]
## [1,]    2    3
## [2,]    2    4
## [3,]    4    5
## [4,]    5    6
## [5,]    3    7
## [6,]    4    7

Degree, Degree Distribution, and Density

# igraph:: prefix needed because statnet/sna also export degree();
# for an undirected graph all three modes return the same values
deg_all <- igraph::degree(ER_graph, mode="all")
deg_out <- igraph::degree(ER_graph, mode="out")
deg_in  <- igraph::degree(ER_graph, mode="in")

deg_all_df <- as.data.frame(deg_all)


# Density of graph: observed edges / possible edges (close to p = 0.2)
edge_density(ER_graph)
## [1] 0.2181818
ggplot(deg_all_df, aes(deg_all)) + geom_histogram(bins = 10) + xlab("Total Degree") + ylab("Count") + ggtitle("Histogram of node degree")

Visualizing our network

There are lots of ways to change your plots of networks. For now, we are keeping things simple! But more is to come.

plot(ER_graph, vertex.size=deg_all*3) # can size by deg (large deg nodes are bigger)

Setting Layouts

You can set a layout or seed. This is good for when we want to plot the same graph with different attributes or perhaps a time lapse of a network. Setting the layout means that every time you plot the network, the nodes will be in the same place.

Link for layouts: (https://igraph.org/r/doc/layout_.html)

# two force-directed layouts: Kamada-Kawai (kk) and Fruchterman-Reingold (fr)
l  <- layout_with_kk(ER_graph)
l2 <- layout_with_fr(ER_graph)

plot(ER_graph,
     vertex.label = NA,vertex.size =15, edge.curved=0.5,layout=l)

plot(ER_graph, vertex.label.color="black", 
     vertex.label.cex=.5,vertex.label.dist=0.2, vertex.size =15, edge.curved=0.5,layout=l2)

ERGM: Fitting a basic model

Next, we will fit a basic ergm!

# Function to get probability of an edge
# Convert log-odds to a probability (inverse logit).
# Written as 1 / (1 + exp(-x)) rather than exp(x) / (1 + exp(x)):
# the two are algebraically identical, but the latter overflows to
# Inf/Inf = NaN for large positive x.
getProbFromLO <- function(x){
   1 / (1 + exp(-x))
}

# Binomial coefficient n-choose-k.
# Delegates to base::choose(), which is numerically stable; the original
# factorial(n) / (factorial(n-k) * factorial(k)) form overflows to
# Inf/Inf = NaN once n > 170.
nchoosek <- function(n,k){
  choose(n, k)
}

set.seed(9)
# edges-only ERGM: dyadic independence, equivalent to an Erdos-Renyi
# model, so the MLE is available in closed form
model1 <- ergm(ER_adj~edges) 
summary(model1)
## 
## ==========================
## Summary of model fit
## ==========================
## 
## Formula:   ER_adj ~ edges
## 
## Iterations:  5 out of 20 
## 
## Monte Carlo MLE Results:
##       Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges -1.27629    0.02433      0  -52.45   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 13724  on 9900  degrees of freedom
##  Residual Deviance: 10387  on 9899  degrees of freedom
##  
## AIC: 10389    BIC: 10396    (Smaller is better.)
# inverse-logit of the edges coefficient recovers the edge probability
getProbFromLO(model1$coef[1]) 
##     edges 
## 0.2181818
# this corresponds to density of graph, mle available
sum(ER_adj)/2/nchoosek(n,2)
## [1] 0.2181818
# now triangles introduce dependence and we use mcmc
set.seed(9)
model2 <- ergm(ER_adj~edges+triangles) 
summary(model2)
## 
## ==========================
## Summary of model fit
## ==========================
## 
## Formula:   ER_adj ~ edges + triangles
## 
## Iterations:  2 out of 20 
## 
## Monte Carlo MLE Results:
##           Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges    -1.560110   0.115356      0 -13.524  < 1e-04 ***
## triangle  0.015096   0.005795      0   2.605  0.00919 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 13724  on 9900  degrees of freedom
##  Residual Deviance: 10383  on 9898  degrees of freedom
##  
## AIC: 10387    BIC: 10401    (Smaller is better.)

\(\hat{\beta}_1*\)change in edges + \(\hat{\beta}_2*\)change in triangles

Edge that creates no new triangles, the conditional log-odds is: \(\hat{\beta}_1\) (which yields probability \(exp(\hat{\beta}_1)/(1+exp(\hat{\beta}_1))\)):

# conditional edge probability when the edge closes no triangle
getProbFromLO(model2$coef[1]) 
##     edges 
## 0.1736308

One edge that creates one triangle: \((\hat{\beta}_1 + \hat{\beta}_2 \times 1)\) with corresponding probability:

# conditional edge probability when the edge closes one triangle
getProbFromLO(model2$coef[1] + model2$coef[2]) 
##     edges 
## 0.1758076

One edge that creates two triangles: \((\hat{\beta}_1 + \hat{\beta}_2 \times 2)\) with corresponding probability:

# conditional edge probability when the edge closes two triangles
getProbFromLO(model2$coef[1] + model2$coef[2]*2) 
##     edges 
## 0.1780057

where the probability is interpreted as the probability that two nodes are connected given we add an edge

Stochastic Block Model

Generate a network using a stochastic block model with 3 communities and plot it (colored by true community)

Feel free to enter your own parameters! Play around and see what happens

n <- 300         # number of nodes; keep it smallish (< 1000)
p <- rep(1/3, 3) # community membership probabilities, must sum to 1
# symmetric 3x3 matrix: entry (k, l) is the probability of an edge
# between a node in community k and a node in community l
B <- matrix(c(.2, .01, .05,
             .01,.2, .1,
             .05, .1, .2), nrow = 3, byrow =TRUE)
# expected number of nodes per community
blockSizes <- n*p

# draw one undirected network from the stochastic block model
sampleSBM <- sample_sbm(n = n, pref.matrix =B , block.sizes =blockSizes , directed = FALSE, loops = FALSE) 
# dense adjacency matrix of the sampled graph
adj_mat_SBM <- get.adjacency(sampleSBM, sparse = FALSE)


# Function to plot adj mat... you could use image() as well
# Function to plot adj mat... you could use image() as well
# Draws a 0/1 adjacency matrix as a two-color raster with ggplot2.
#   matrix   : square 0/1 matrix to draw (NOTE: the name shadows base::matrix)
#   col1/col2: fill colors for 0 and 1 cells respectively
#   title    : plot title
#   alpha_num: currently unused -- TODO confirm it can be dropped
#   n        : matrix dimension, used to set the axis limits
# Relies on reshape::melt() naming the row/column index columns X1/X2.
matrixPlotter <- function(matrix, col2="lightcoral", col1= "lightcyan2",title = "", alpha_num = .8,n){
 p <-  ggplot(melt(matrix), aes(x = X1, y = X2, fill = as.factor(value))) +
   geom_raster() +ggtitle(title)+ xlab("") + ylab("")+
   theme_bw() +
   labs(fill = "")+
   theme(
     aspect.ratio = 1,
     panel.grid = element_blank(), 
     plot.title = element_text(hjust = 0.5),
    # axis.line.x = element_line(),
     panel.border = element_rect(size=2,linetype="solid",color="gray")
     #panel.border=element_rect(fill=FALSE),axis.line.y = element_line()
     )+  scale_fill_manual(values = c(col1, col2))+
   # expand = c(0, 0) removes the default padding around the raster
   scale_x_continuous(limits = c(0,n), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0,n), expand = c(0, 0)) 
 return(p) 
}

matrixPlotter(adj_mat_SBM,n=n)

# Create true labels: nodes are ordered by block, so the first n*p[1]
# nodes are community 1, and so on
trueLabels <- c(rep(1, n*p[1]), rep(2, n*p[2]), rep(3, n*p[3])) 

# adj_mat_SBM is symmetric (the SBM was sampled with directed = FALSE),
# so build an undirected graph; the default mode = "directed" would
# double every edge and draw arrowheads in the plot
graph_SBM <- graph_from_adjacency_matrix(adj_mat_SBM, mode = "undirected")

# Add an attribute
V(graph_SBM)$trueLabels <- as.factor(trueLabels)

# Plot the graph colored by true community
plot(graph_SBM, vertex.color = V(graph_SBM)$trueLabels )

Community Detection

Spectral Methods

eigen_sbm <- eigen(adj_mat_SBM)
# rank eigenvalues by magnitude -- adjacency spectra can contain large
# negative eigenvalues, so order by abs()
sort_eig_vals <- order(abs(eigen_sbm$values), decreasing = TRUE)
# Adjacency spectral embedding: top-3 eigenvectors scaled by |lambda|^(1/2).
# abs() is required because a selected eigenvalue may be negative and
# (-x)^(1/2) yields NaN in R, silently corrupting the embedding.
X_hat <- eigen_sbm$vectors[,sort_eig_vals[1:3]]%*%diag(abs(eigen_sbm$values[sort_eig_vals[1:3]])^(1/2))

# cluster the embedded points into 3 groups
k_means_X_hat <- kmeans(X_hat, centers = 3)
k_means_labels <- k_means_X_hat$cluster



# collect embedding coordinates, true labels, and k-means labels
sbm_spectral_df <- as.data.frame(cbind(X_hat, trueLabels, k_means_labels))
sbm_spectral_df[,4] <- as.factor(sbm_spectral_df[,4])
sbm_spectral_df[,5] <- as.factor(sbm_spectral_df[,5])


colnames(sbm_spectral_df) <- c("V1", "V2", "V3", "TrueLabel", "Kmeans")
ggplot(sbm_spectral_df, aes(x = V1, y = V2, col = TrueLabel) ) + geom_point()

ggplot(sbm_spectral_df, aes(x = V1, y = V2, col = Kmeans) ) + geom_point()

Exercise: Try looking at the non scaled eigenvectors! Also, plot different vectors against each other and see if results appear different visually

Uninformed GMM

# Gaussian mixture model on the spectral embedding, 3 components
uninformed_GMM <- Mclust(X_hat,G = 3 )
sbm_spectral_df$GMM <- as.factor(uninformed_GMM$classification)
ggplot(sbm_spectral_df, aes(x = V1, y = V2, col = GMM) ) + geom_point()

Girvan-Newman algorithm: Karate Club Data

The number of shortest paths passing through edges within a community is low, while the number passing through edges between different communities is high

# Girvan-Newman: iteratively remove the edge with highest betweenness;
# the removal sequence defines a dendrogram over the nodes
dendrogram <- cluster_edge_betweenness(karate)
dendrogram
## IGRAPH clustering edge betweenness, groups: 6, mod: 0.35
## + groups:
##   $`1`
##   [1] "Mr Hi"    "Actor 2"  "Actor 4"  "Actor 8"  "Actor 12" "Actor 13"
##   [7] "Actor 18" "Actor 20" "Actor 22"
##   
##   $`2`
##   [1] "Actor 3"  "Actor 10" "Actor 14" "Actor 29"
##   
##   $`3`
##   [1] "Actor 5"  "Actor 6"  "Actor 7"  "Actor 11" "Actor 17"
##   
##   + ... omitted several groups/vertices

We know there are 2 communities though so we can make a cut.

Misclustering Rate

dendo2 <-cut_at(dendrogram,no = 2) # cut into two groups

# get number of misclustered nodes
misclust1 <- length(which(dendo2 != faction))
misclust1
## [1] 32
# consider label switching: cluster IDs are arbitrary, so also count
# mismatches after swapping labels 1 and 2, then take the minimum
dendo2alt <- ifelse(dendo2==1,2,1) 
misclust2 <-  length(which(dendo2alt != faction))
misclust2
## [1] 2
min(misclust1, misclust2)/length(dendo2)
## [1] 0.05882353
# plot by estimated community
V(karate)$dendo2est <- dendo2alt
plot(karate, vertex.color = V(karate)$dendo2est)

AddHealth Data

  • We are going to take a look at an AddHealth network! This particular version gives us gender, race, and grade

  • Things to also note: this is NOT a binary network!

  • AMEN can’t deal with missing covariates so, for simplicity, we are just removing the people with NA. We can use imputation methods, but we will not worry about that for now

data(addhealthc3)

# finds people with missing covar info
na_people <- which(rowSums(is.na(addhealthc3$X)) > 0) 
# drop those people from both the sociomatrix and the covariates
ah_net <- addhealthc3$Y[-na_people, -na_people]
ah_covar <- addhealthc3$X[-na_people,]
ah_covar_df <- as.data.frame(ah_covar)

## create an igraph object
# NOTE(review): ah_net is not binary (see text above); without
# weighted = TRUE, igraph creates one edge per unit of weight
# (multi-edges) -- confirm this is the intended plotting behavior
addhealth_g <- graph_from_adjacency_matrix(ah_net)
V(addhealth_g)$gender <- as.factor(ah_covar[,1])
V(addhealth_g)$color <- ifelse(ah_covar[,1]==1, "pink", "blue")

Exploratory Data Analysis

# geom_bar() counts discrete values directly; geom_histogram(stat = "count")
# is the deprecated spelling of the same thing and triggers a warning
gg1 <- ggplot(ah_covar_df, aes(as.factor(race)))  + 
  geom_bar() + 
  xlab("Race")+ 
  scale_x_discrete(labels = c("White", "Hispanic", "Other"))

gg2 <- ggplot(ah_covar_df, aes(as.factor(female))) + 
  geom_bar()+ 
  xlab("Female")

gg3 <- ggplot(ah_covar_df, aes(as.factor(grade)))  + 
  geom_bar() +
  xlab("Grade")

# stack the three bar charts vertically
grid.arrange(gg1,gg2,gg3)

# same network, colored two ways: by the gender factor, then by the
# pre-computed pink/blue color attribute
plot(addhealth_g, vertex.color =V(addhealth_g)$gender)

plot(addhealth_g, vertex.color =V(addhealth_g)$color)

### two factor plot
ah_covar_df$female <- as.factor(ah_covar_df$female)
ah_covar_df$grade  <- as.factor(ah_covar_df$grade)

# cross-tabulate female x grade, then draw as a stacked bar chart
countdf <- data.frame(table(ah_covar_df$female,ah_covar_df$grade))
names(countdf) <- c("female","grade","Count")
ggplot(data=countdf, 
       aes(x=female, y=Count, fill=grade)) + 
  geom_bar(stat="identity")

AddHealth Degree Distribution

####
# zero the diagonal so self-ties don't count toward degree
diag(ah_net)<-0
# row sums = out-degree, column sums = in-degree (directed network)
degree_dist_out <- rowSums(ah_net)
degree_dist_in  <- colSums(ah_net)

# NOTE(review): degree_df is built but never used below, and its column
# names are misleading (col 1 holds degree values, col 2 holds the
# in/out label) -- candidate for removal or renaming
degree_df <- as.data.frame(c(degree_dist_out, degree_dist_in))
degree_df$deg <- c(rep("Out Degree",length(degree_dist_out)), 
                  rep("In Degree", length(degree_dist_in)))
colnames(degree_df) <- c("Count", "Degree")

# per-node average out-degree (row means) and in-degree (col means)
df2 <- as.data.frame(cbind(rowMeans(ah_net), colMeans(ah_net)))
df2$idx <- seq(1, dim(ah_net)[1], 1)
colnames(df2) <- c("RowMeans", "ColMeans", "idx")

melt_df2 <- melt(df2, id = "idx")

avg_deg <- ggplot(melt_df2, aes(x = idx, y =value, col = variable)) + 
  geom_point(size = 3) +
  xlab("ID") +
  ylab("Average Degree") + 
  guides(col = guide_legend(title = ""))

# scatter of out- vs in-degree to eyeball sender/receiver correlation
deg_corr <- ggplot(df2, aes(x = RowMeans, y = ColMeans)) +
  geom_point(size = 3) + 
  xlab("Row Means: Out Degree") + 
  ylab("Col Means: In Degree")

grid.arrange(avg_deg, deg_corr, nrow = 1)


Question

What can we say about possible correlation between row and column means?

Using AMEN

Fitting a basic amen model. Note that this network has an FRN likelihood. First, we fit no multiplicative effects:

# AMEN social relations model with row/column covariates, no
# multiplicative effects (R = 0); "frn" = fixed-rank nomination likelihood
amen_basic <- ame(Y = ah_net, 
             Xrow = ah_covar,
             Xcol = ah_covar,
             dcor = TRUE, rvar = TRUE, cvar = TRUE,
             R = 0, nscan = 2000, burn = 500, 
             model = "frn", plot = FALSE)
summary(amen_basic)
## 
## Regression coefficients:
##             pmean   psd z-stat p-val
## intercept  -3.879 0.890 -4.360 0.000
## female.row -0.371 0.241 -1.542 0.123
## race.row   -0.055 0.110 -0.498 0.619
## grade.row   0.135 0.067  2.013 0.044
## female.col -0.258 0.180 -1.435 0.151
## race.col   -0.044 0.084 -0.524 0.600
## grade.col   0.175 0.055  3.166 0.002
## 
## Variance parameters:
##     pmean   psd
## va  0.296 0.148
## cab 0.012 0.063
## vb  0.185 0.083
## rho 0.793 0.056
## ve  1.000 0.000
# look at posterior credible intervals (2.5%, 50%, 97.5% per coefficient)
post_basic <- apply(amen_basic$BETA,2,function(x)quantile(x,c(.025,.5,.975)))

Making dyadic covariates

Consider creating dyadic covariates to indicate whether or not two people share particular attributes.

n <- dim(ah_covar_df)[1]
# n x n x 3 array of dyadic covariates for AMEN's Xdyad argument
dyadic_covar <- array(0, dim = c(n,n,3))

# Indicator matrices: entry (i, j) is 1 when persons i and j share the
# attribute.  outer() with `==` builds each n x n matrix in a single
# vectorized step; this replaces the original O(n^2) scalar double loop
# and produces identical 0/1 matrices.
same_grade  <- 1 * outer(ah_covar_df$grade,  ah_covar_df$grade,  "==")
same_gender <- 1 * outer(ah_covar_df$female, ah_covar_df$female, "==")
same_race   <- 1 * outer(ah_covar_df$race,   ah_covar_df$race,   "==")

dyadic_covar[,,1] <- same_grade
dyadic_covar[,,2] <- same_gender
dyadic_covar[,,3] <- same_race


# same model as amen_basic, now with the three dyadic "same attribute"
# indicators added via Xdyad
amen_dyad <- ame(Y = ah_net, 
             Xrow = ah_covar,
             Xcol = ah_covar,
             Xdyad = dyadic_covar,
             dcor = TRUE, rvar = TRUE, cvar = TRUE,
             R = 0, nscan = 2000, burn = 500, 
             model = "frn", plot = FALSE)


# look at posterior credible intervals
post_dyad <- apply(amen_dyad$BETA,2,function(x)quantile(x,c(.025,.5,.975)))
summary(amen_dyad)
## 
## Regression coefficients:
##             pmean   psd z-stat p-val
## intercept  -3.336 1.056 -3.158 0.002
## female.row -0.480 0.253 -1.897 0.058
## race.row   -0.207 0.143 -1.450 0.147
## grade.row   0.117 0.068  1.733 0.083
## female.col -0.356 0.239 -1.492 0.136
## race.col   -0.206 0.139 -1.482 0.138
## grade.col   0.192 0.058  3.321 0.001
## .dyad       1.261 0.203  6.218 0.000
## .dyad       0.088 0.166  0.530 0.596
## .dyad      -0.526 0.356 -1.481 0.139
## 
## Variance parameters:
##     pmean   psd
## va  0.364 0.191
## cab 0.039 0.088
## vb  0.225 0.089
## rho 0.729 0.060
## ve  1.000 0.000

Multiplicative Effects

# binarize the valued network and inspect its singular values (scree
# plot) to guide the choice of multiplicative-effect rank R
ah_binary <- 1*(ah_net>0)
diag(ah_binary) <- 0
decomp_ah <- svd(ah_binary)
plot(decomp_ah$d)

What rank should we pick?

#Elbow Method for finding the optimal number of clusters
set.seed(123)

# Compute and plot wss for k = 2 to k = 15.
k_max <- 15
data <- ah_binary
# total within-cluster sum of squares for each k
wss <- sapply(1:k_max, 
              function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
plot(1:k_max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")


Question

What might be some appropriate ranks to try?

More Complicated Models

Lets try fitting a few different multi effect models

# rank-1 multiplicative effects
# NOTE(review): nscan = 200 here vs 2000/5000 in the other fits --
# possibly a typo for 2000; confirm whether the short chain is intended
amen_R1 <- ame(Y = ah_net, 
             Xrow = ah_covar,
             Xcol = ah_covar,
             Xdyad = dyadic_covar,
             dcor = TRUE, rvar = TRUE, cvar = TRUE,
             R = 1, nscan = 200, burn = 500,
             model = "frn", plot = FALSE)
# look at posterior credible intervals
post_R1 <- apply(amen_R1$BETA,2,function(x)quantile(x,c(.025,.5,.975)))

# rank-2 multiplicative effects
amen_R2 <- ame(Y = ah_net, 
             Xrow = ah_covar,
             Xcol = ah_covar,
             Xdyad = dyadic_covar,
             dcor = TRUE, rvar = TRUE, cvar = TRUE,
             R = 2, nscan = 2000, burn = 500,
             model = "frn", plot = FALSE)

# predicted adjacency: 1 where the posterior mean latent Z is positive
pred_net_R2 <- 1*(amen_R2$EZ>0)
post_R2 <- apply(amen_R2$BETA,2,function(x)quantile(x,c(.025,.5,.975)))

# rank-3 multiplicative effects
amen_R3 <- ame(Y = ah_net, 
             Xrow = ah_covar,
             Xcol = ah_covar,
             Xdyad = dyadic_covar,
             dcor = TRUE, rvar = TRUE, cvar = TRUE,
             R = 3, nscan = 5000, burn = 500, 
             model = "frn", plot = FALSE)

post_R3 <- apply(amen_R3$BETA,2,function(x)quantile(x,c(.025,.5,.975)))

pred_net_R3 <- 1*(amen_R3$EZ>0)

# rank-4 multiplicative effects
amen_R4 <- ame(Y = ah_net, 
             Xrow = ah_covar,
             Xcol = ah_covar,
             Xdyad = dyadic_covar,
             dcor = TRUE, rvar = TRUE, cvar = TRUE,
             R = 4, nscan = 5000, burn = 500, 
             model = "frn", plot = FALSE)

post_R4 <- apply(amen_R4$BETA,2,function(x)quantile(x,c(.025,.5,.975)))
pred_net_R4 <- 1*(amen_R4$EZ>0)

Diagnostics

# Plot 95% CI for beta
posteriorBeta <- 
as.data.frame(rbind(t(post_dyad),
      t(post_R1),
      t(post_R2),
      t(post_R3),
      t(post_R4)
     ))

modelVec <- c("dyad basic", "R = 1", "R = 2", "R = 3", "R = 4")

tmprownames <- rownames(t(post_dyad))
tmprownames[8:10] <- c("grade.dyad", "gender.dyad", "race.dyad")
posteriorBeta$variable <- rep(tmprownames,5)
posteriorBeta$model    <- c(rep(modelVec, each = 10))
rownames(posteriorBeta) <- NULL

colnames(posteriorBeta) <- c("l", "m","h", "variable", "model")

## plot beta estimates
posteriorBeta <- subset(posteriorBeta, variable != "intercept")

ggplot(posteriorBeta, aes(x= variable, y= m, col =model, group =model))+ 
  geom_errorbar(width = .8, size = 2.5, 
                aes(ymin= l, ymax = h, group = model), 
                position = position_dodge(width = .5))+ 
  geom_point(size = 6, position=position_dodge(width=.5))+xlab("")+ylab("Beta")+
  geom_hline(aes(yintercept = 0)) + ggtitle("95% Posterior CI for Beta")

compare_models_GOF(modelList,modelnames)


Question

Consider playing around with some other AMEN models and pick your favorite. Why did you pick this model?

GoT Data!

Getting our data!

What exactly is this game of thrones data? It is a little vague with how connections are being defined… however, it comes from this website:

https://datascienceplus.com/network-analysis-of-game-of-thrones/

(it is mainly looking at interactions by family ties)

# repmis::source_data downloads and loads the .RData files, which define
# the objects named in the printed output below
source_data("https://github.com/ShirinG/blog_posts_prep/blob/master/GoT/union_characters.RData?raw=true")
## [1] "union_characters"
source_data("https://github.com/ShirinG/blog_posts_prep/blob/master/GoT/union_edges.RData?raw=true")
## [1] "union_edges"

Make our adjacency matrix

# build the directed character graph from the edge list + vertex table
union_graph <- graph_from_data_frame(union_edges, directed = TRUE, vertices = union_characters)
got_A       <- as.matrix(as_adj(union_graph))

We are only going to look at 9 houses

# get top 9 houses
house_subset_labels     <- unique(union_characters$house2)[-1]

# subset characters that belong to those houses
# NOTE(review): the labels come from house2 but the filter is on house --
# confirm the two columns share the same label set
subset_characters       <- subset(union_characters, house %in% house_subset_labels) 

# subset edges so that we only have connections involving the subsetted 9
subset_type             <- subset(union_edges, source %in% subset_characters$name & target %in% subset_characters$name)

# create subsetted version of graph
got_G_subset            <-  graph_from_data_frame(subset_type, directed = TRUE, vertices = subset_characters)

# create undirected subsetted version of graph
got_G_subset_undirected <-  graph_from_data_frame(subset_type, directed = FALSE, vertices = subset_characters)

# get adj mat
got_A_subset <- as.matrix(as_adj(got_G_subset))


# try a clustering algorithm! (Girvan-Newman on the undirected version)
ceb <- cluster_edge_betweenness(got_G_subset_undirected)
plot(ceb,
     got_G_subset_undirected,
     vertex.label = gsub(" ", "\n", V(got_G_subset_undirected)$name),
     vertex.shape = V(got_G_subset_undirected)$shape,
     vertex.size = (V(got_G_subset_undirected)$popularity + 0.5) * 5, 
     vertex.frame.color = "gray", 
     vertex.label.color = "black", 
     vertex.label.cex = 0.8)

Add in some graph attributes for plotting

# one row per (house, color) pair, for the plot legend below
color_vertices <- subset_characters %>%
  group_by(house2, color) %>%
  summarise(n = n()) %>%
  filter(!is.na(color))

# one row per (edge type, color) pair, for the legend
colors_edges <- subset_type %>%
  group_by(type, color) %>%
  summarise(n = n()) %>%
  filter(!is.na(color))

# fixed layout, reused by the clustering plots further down
layout <- layout_with_fr(got_G_subset)

plot(got_G_subset,
     layout = layout,
     vertex.label = NA,
     vertex.shape = V(got_G_subset)$shape,
     vertex.color = V(got_G_subset)$color, 
     vertex.size =  (V(got_G_subset)$popularity + 0.5) * 5, 
     vertex.frame.color = "gray", 
     vertex.label.color = "black", 
     vertex.label.cex = 0.8,
     edge.arrow.size = 0.5,
     edge.color = E(got_G_subset)$color,
     edge.lty = E(got_G_subset)$lty)

# NA entries create blank spacer rows between the two legend sections
legend("topleft", legend = c(NA, "Node color:", as.character(color_vertices$house2), NA, "Edge color:", as.character(colors_edges$type)), pch = 19,
       col = c(NA, NA, color_vertices$color, NA, NA, colors_edges$color), pt.cex = 1, cex = 1, bty = "n", ncol = 1,
       title = "") 

Clustering with GoT

# Spectral clustering method: embed via SVD, then cluster the embedding
svdY <-svd(got_A_subset)
plot(svdY$d)

# rank-9 embedding of the right singular vectors, scaled by d^(1/2)
xhat <- svdY$v[,1:9]%*%diag(svdY$d[1:9]^(1/2))

kmeansGOT <- kmeans(xhat, centers = 9)
table(kmeansGOT$cluster)
## 
##  1  2  3  4  5  6  7  8  9 
## 11  5  5 84  9  7  7  8  5
uninformedGMM <-Mclust(xhat, 9)
table(uninformedGMM$classification)
## 
##  1  2  3  4  5  6  7  8  9 
##  5 10 84  7  5  9  7  8  6
edge_between        <- cluster_edge_betweenness(got_G_subset, directed = TRUE)
reduce_edge_between <- cut_at(edge_between,no = 9) # cut dendrogram into nine groups
table(reduce_edge_between)
## reduce_edge_between
##  1  2  3  4  5  6  7  8  9 
## 92 10 14  6  5  3  9  1  1
# plot by clustering labels
commGOT <- as.data.frame(cbind(kmeansGOT$cluster, uninformedGMM$classification))
colnames(commGOT) <- c("kmeans", "uninformedGMM")

commGOT <- mutate(commGOT, 
                  kmeans = as.factor(kmeans),
                  uninformedGMM = as.factor(uninformedGMM))

# make separate graph for looking at clustering results
got_G_subset_comm <- got_G_subset

# relevel for coloring
levels(commGOT$kmeans) <- unique(V(got_G_subset)$color)
levels(commGOT$uninformedGMM) <- unique(V(got_G_subset)$color)
V(got_G_subset_comm)$color_kmeans <- commGOT$kmeans
V(got_G_subset_comm)$color_gmm    <- commGOT$uninformedGMM



# K means plot -- same fixed layout as the earlier house-colored plot,
# so the two colorings can be compared node-by-node
plot(got_G_subset_comm,
     layout = layout,
     vertex.label = NA,#gsub(" ", "\n", V(union_graph)$name),
     vertex.shape = V(got_G_subset_comm)$shape,
     vertex.color = V(got_G_subset_comm)$color_kmeans, 
     vertex.size =  (V(got_G_subset_comm)$popularity + 0.5) * 5, 
     vertex.frame.color = "gray", 
     vertex.label.color = "black", 
     vertex.label.cex = 0.8,
     edge.arrow.size = 0.5,
     edge.lty = E(got_G_subset_comm)$lty,
   main = "K-Means Clustering")

# uninformed GMM
plot(got_G_subset_comm,
     layout = layout,
     vertex.label = NA,
     vertex.shape = V(got_G_subset_comm)$shape,
     vertex.color = V(got_G_subset_comm)$color_gmm, 
     vertex.size =  (V(got_G_subset_comm)$popularity + 0.5) * 5, 
     vertex.frame.color = "gray", 
     vertex.label.color = "black", 
     vertex.label.cex = 0.8,
     edge.arrow.size = 0.5,
     edge.lty = E(got_G_subset_comm)$lty,
   main = "GMM Clustering")

Have some fun :)

Consider looking at centrality scores, degree distribution, different layouts! Try to understand what is going on with these clustering methods. Why might we be finding one big community? What are some ideas on how to fix this?

Extras

More Clustering Methods

Leading eigenvector

# community detection via the leading eigenvector of the modularity matrix
eigen_clust <- cluster_leading_eigen(karate)
set.seed(1)
plot(eigen_clust,karate)

# Tell it to find 2 communities
clusters <- cluster_leading_eigen(karate, steps = 1) #at most two 

Label propagation algorithm

The algorithm terminates when it holds for each node that it belongs to a community to which a maximum number of its neighbors also belong.

fixed: TRUE-label will not change. initial: initial point

#non-negative values: different labels; negative values: no labels
initial <- rep(-1,vcount(karate))
fixed   <- rep(FALSE,vcount(karate))
#need to have names
names(initial) <- names(fixed) <- V(karate)$name 
# seed the two faction leaders with fixed labels; all other nodes start
# unlabeled and receive labels by propagation
initial['Mr Hi']   <- 1
initial['John A']  <-2
fixed['Mr Hi'] <- fixed['John A'] <- TRUE
lab<- cluster_label_prop(karate,initial = initial,fixed = fixed)
set.seed(1)
plot(lab,karate)

# ground-truth communities for visual comparison
plot(truth,karate)

Exercise: Calculate misclustered nodes for above two methods and compare to the edge betweenness method

Degenerate Behavior of the ERGM

Our model is completely defined once we have estimates for our coefficients; thus defining a probability distribution across networks of this size/density. If we have specified a model that fits our observed data well, then simulated networks from this model should be similar to the observed data.

# start from a random Bernoulli network of 100 nodes at density 0.1
g_use2 <- network(100,density=0.1,directed=FALSE,seed=1)
# simulate 1000 networks from an edges + 2-star ERGM with fixed
# coefficients, keeping only the summary statistics of each draw
# NOTE(review): statsonly= is deprecated in newer ergm releases in favor
# of output = "stats" -- confirm against the installed version
g_sim2 <- simulate(~edges+kstar(2),
                   nsim=1000, coef=c(-1,1),
                   basis=g_use2,
                   control=control.simulate(
                     MCMC.burnin=1000,
                     MCMC.interval=100),seed=2, statsonly = TRUE)

g_sim2 <- as.data.frame(g_sim2)
star2_simulResults <- g_sim2[,2]
edges_simulResults <- g_sim2[,1]

# histogram of each statistic across the 1000 simulated networks
g_sim2MELT <- melt(g_sim2)
hist <- ggplot(g_sim2MELT) + geom_histogram(aes(value)) + facet_wrap(~variable, scales = "free")

# trace plots: statistic value vs. simulation index
g_sim2MELT$iter <- rep(seq(1, 1000, by = 1),2)
traces <- ggplot(g_sim2MELT) + geom_point(aes(x = iter, y = value)) +
  facet_wrap(~variable, scales = "free")

hist

traces

From the above plots, we notice that we are converging to a degenerate stationary distribution: the chain converges to a complete graph, which is not very interesting. This is potentially occurring because the addition of new edges changes our log odds too much. For example, the addition of a new edge can create many new 2-stars and thus lead to a substantial increase in log odds. We end up missing the MLE. We need the addition of statistics to produce smaller changes in log odds in order to obtain a stable graph.

Heather Mathews

07 November, 2019